This is transcript-level analysis. Importantly, it is also possible to use aggregation transcript p-values to perform gene differential expression, as introduced in Yi et al., 2017. In this framework, differential expression is performed on transcripts as usual, but then transcript-level p-values can be aggregated to obtain gene-level p values. The results are therefore different than the old method of gene_mode=T. Also see this walkthrough.
The original FASTQ files (50 bp, single-end, reverse stranded) were processed with kallisto using a custom transcriptome, which is a combination of the REFSEQ mRNA sequences AND RepBase D. melanogaster TE seqeunces (refGene_plus_repbase)
./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL25 YL25_shW_RNAseq_1st.fq.gz
./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL26 YL26_shSmt3_RNAseq_1st.fq.gz
./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL27 YL27_shW_RNAseq_2nd.fq.gz
./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL28 YL28_shSmt3_RNAseq_2nd.fq.gz
Preparing sleuth object
s2c<-data.frame(sample=c("YL25","YL27","YL26","YL28"),condition=c("ctrl", "ctrl","KD","KD"),
path=c("../kallisto/YL25/","../kallisto/YL27/", "../kallisto/YL26/","../kallisto/YL28/"))
s2c$path<-as.character(s2c$path)
print(s2c)
## sample condition path
## 1 YL25 ctrl ../kallisto/YL25/
## 2 YL27 ctrl ../kallisto/YL27/
## 3 YL26 KD ../kallisto/YL26/
## 4 YL28 KD ../kallisto/YL28/
#this is to create a transcript to gene name dictionary - I just transform each fasta header.
t<-read.table("../kallisto/YL26/abundance.tsv", header=T, as.is=T)
t2g<-data.frame("target_id"=t$target_id, gene=sub(":CDS.*","",sub(".*gene=","", t$target_id)))
t2g$coding<-grepl("CDS", t2g$target_id)
t2g$TE<-grepl("\\|", t2g$target_id)
# number of elements of different type: with CDS annotation, RepBase entries (TE), and none (ncRNAs or genes with no CDS)
t2g %>% select(gene,coding, TE) %>% distinct() %>% group_by(coding, TE) %>% tally()
## # A tibble: 3 x 3
## # Groups: coding [2]
## coding TE n
## <lgl> <lgl> <int>
## 1 FALSE FALSE 2673
## 2 FALSE TRUE 238
## 3 TRUE FALSE 13939
# make "so", note that aggregation column is supplied, but gene_mode is FALSE
so <- sleuth_prep(s2c, ~condition, extra_bootstrap_summary=T, read_bootstrap_tpm = TRUE, aggregation_column = 'gene', target_mapping = t2g, gene_mode=FALSE)
## reading in kallisto results
## dropping unused factor levels
## ....
## normalizing est_counts
## 18141 targets passed the filter
## normalizing tpm
## merging in metadata
## summarizing bootstraps
## ....
Here, we can inspect the input data table. This is in transcripts as aggregation is for the p-values.
units
## [1] "est_counts"
table1<-sleuth_to_matrix(so, which_df = "obs_norm", which_units = units) %>% as.data.frame() %>%
rename(shW.1=YL25, shW.2=YL27, shSmt3.1=YL26, shSmt3.2=YL28)
table1 %>% round(digits=2) %>% datatable(options=list(pageLength=5))
## Warning in instance$preRenderHook(instance): It seems your data is too big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
Running sleuth Likelihood Ratio test (LRT), and displaying a table of significantly changed genes/tx, qval<0.05. This table DOES NOT have information about up- or down- regulation.
so <- sleuth_fit(so, ~condition,'full')
so <- sleuth_fit(so, ~1, 'reduced')
so <- sleuth_lrt(so, 'reduced', 'full')
sleuth_results_LRT_gene <- sleuth_results(so, 'reduced:full', test_type = 'lrt', show_all = TRUE)
sleuth_results_LRT_tx <-sleuth_results(so, 'reduced:full', test_type = 'lrt', show_all = TRUE, pval_aggregate = F)
#gene aggregated table
dplyr::filter(sleuth_results_LRT_gene, qval <= 0.05) %>% datatable(options = list(pageLength=5))
#tx aggregated table
dplyr::filter(sleuth_results_LRT_tx, qval <= 0.05) %>% datatable(options = list(pageLength=5))
Running wald test & plotting MA plot. Note that there are beta-vals only for the tx-based table where the diff.expr was performed, therefore, the MA-plot is transcript-based.
Note: the displayed table shows all results, however the MA plot does not show txs which did not pass filters, and Inf values.
so <- sleuth_wt(so, 'conditionKD')
sleuth_results_wald_gene <- sleuth_results(so, 'conditionKD', show_all = TRUE)
sleuth_results_wald_tx<- sleuth_results(so, 'conditionKD', show_all = TRUE, pval_aggregate = F)
#gene aggregated table
sleuth_results_wald_gene %>% datatable(options = list(pageLength=5))
#tx aggregated table
sleuth_results_wald_tx %>% datatable(options = list(pageLength=5))
# MA plot, transcript analysis. Adding count data for later.
table.counts<-sleuth_to_matrix(so, which_df = "obs_norm", which_units = units) %>% as.data.frame() %>% rename(shW.1=YL25, shW.2=YL27, shSmt3.1=YL26, shSmt3.2=YL28) %>% tibble::rownames_to_column("target_id") %>% mutate(x=(shW.1+shW.2)/2, y=(shSmt3.1+shSmt3.2)/2) %>% mutate(log2FC=log2(y/x))
res<-sleuth_results(so, 'conditionKD', show_all = FALSE, pval_aggregate = F) %>% left_join(table.counts, by="target_id") %>% mutate(signFC=(qval<0.05 & abs(log2FC)>1), significant=qval<0.05) %>% arrange(qval)
p<-ggplot(res, aes(mean_obs, b)) + theme_classic() + ggtitle("MA plot of transcripts and TEs, sleuth")
p <- p + xlab(paste("mean( log(counts+ 0.5 ) )"))
p <- p + ylab(paste0("beta: conditionKD"))
p <- p + geom_text(data=(sleuth_results_wald_tx %>% filter(str_detect(target_id,"(smt3|^R1_DM|^R2_DM)"))), aes(mean_obs, b, label=sub(".*gene=(.*):CDS.*", "\\1",sub("_DM.*","",target_id))), position = position_nudge(x=0.5))
p1 <- p + geom_point(aes(colour = significant), alpha = 0.2) + scale_colour_manual(values = c("black", "green4"))
p1

Fig. MA plot from sleuth analysis, Wald test, transcript-based. All transcripts with qvalue<0.05 are highlighted (warning: beta != log2FC).
Below it is the same MA plot, but I only marked genes with >2 fold change, calculated from the original values in the sleuth table above.
p2 <- p + geom_point(aes(colour = signFC), alpha = 0.2) + scale_colour_manual(values = c("black", "red"))
p2

Fig. MA plot from sleuth analysis, transcript-level, Wald test. (warning: beta != log2FC). Only genes with qval<0.05 & log2FC (from kallisto numbers) are highlighted.
This is an alternative visualization of the data, where we do a scatter plot in control vs knockdown. We can choose to select the dots based on qvalues either from the Wald test, or the LTR test. The test choice does not change the interpretation of the data in the light of the R1, R2 up-regulation compared to genes.
#values tables
table.tpm<-sleuth_to_matrix(so, which_df = "obs_norm", which_units = "tpm") %>% as.data.frame() %>% rename(shW.1=YL25, shW.2=YL27, shSmt3.1=YL26, shSmt3.2=YL28) %>% tibble::rownames_to_column("target_id") %>% mutate(x=(shW.1+shW.2)/2, y=(shSmt3.1+shSmt3.2)/2) %>% mutate(log2FC=log2(y/x))
tab.raw<-full_join(table.counts, table.tpm, by="target_id",suffix=c(".count",".tpm"))
### stats tables
tab.wald<-sleuth_results(so, 'conditionKD', show_all = TRUE, pval_aggregate = F) %>% arrange(target_id) # wald test
tab.lrt<-sleuth_results(so, 'reduced:full', test_type = 'lrt', show_all = TRUE, pval_aggregate = F) %>% arrange(target_id) #lrt test
tab.stat<-full_join(tab.wald, tab.lrt, by="target_id", suffix=c(".wald",".lrt"))
### all data together, fold change is taken from the est_counts
tab.plot<-full_join(tab.stat, tab.raw, by="target_id") %>% mutate(signWald=(abs(log2FC.count)>1 & qval.wald<0.05), signLRT=(abs(log2FC.count)>1 & qval.lrt<0.05))
### plot
p.sub<-geom_text(data=(tab.plot %>% filter(str_detect(target_id,"(smt3|^R1_DM|^R2_DM)"))), aes(log2(x.tpm+1), log2(y.tpm+1), label=sub(".*gene=(.*):CDS.*", "\\1",sub("_DM.*","",target_id))), position = position_nudge(x=1))
top20.x<-tail(sort(tab.plot$x.tpm),20)[1]
top20.y<-tail(sort(tab.plot$y.tpm),20)[1]
p<-ggplot(tab.plot, aes(log2(x.tpm+1), log2(y.tpm+1))) + theme_classic() + theme(legend.position="left") + p.sub
p<-p + scale_color_manual(values=c("black", "beige", "green4"))
p<-p + xlab(paste("shW, log2 (av.tpm + 1) ")) + ylab(paste("shSmt3, log2 (av.tpm + 1) "))
p<-p + geom_hline(yintercept = log2(top20.y+1), linetype="dotted") + geom_vline(xintercept = log2(top20.x+1), linetype="dotted")
p1<-p + geom_point(aes(colour = paste(signWald)), alpha = 0.4) + geom_abline(intercept = 0, slope=1, col="blue")
p2<-p + geom_point(aes(colour = paste(signLRT)), alpha = 0.4) + geom_abline(intercept = 0, slope=1, col="blue")
p3<-ggExtra::ggMarginal(p1, type="histogram", size=3, fill = "grey70")
p4<-ggExtra::ggMarginal(p2, type="histogram", size=3, fill = "grey70")
gridExtra::grid.arrange(p3, p4, nrow=1)

Figure. Scatter plots of transcript expression in control vs KD. The dotted line denotes area that contains the top 20 most highly expressed genes in the corresponding condition. Values log2FC>2 and significant: qvalue (Wald or LRT) < 0.05 are highlighted. The histograms above the y-axis and x-axis show the distribution of values. Blue line shows 1:1 ratio.
Note: Here, pale dots show values (“NA”) which did not pass various sleuth filters and should be ignored. Can be removed by “show_all=FALSE”.
First we prepare the data in a convenient table of all.
tab.all<-tab.plot %>% mutate("filt_out"=is.na(b), "type"=as.factor(ifelse(str_detect(target_id, "\\|"),"TE","RefSeq")))
Number of RepBase (TE) and other (protein-coding genes,ncRNA, etc) entries from RefSeq (all genes) analyzed; some were filtered out because they didn’t pass sleuth criteria (usually very low read counts or large discrepancy between reps).
tab.all %>% group_by(type, filt_out) %>% tally() %>% arrange(filt_out)
## # A tibble: 4 x 3
## # Groups: type [2]
## type filt_out n
## <fct> <lgl> <int>
## 1 RefSeq FALSE 17975
## 2 TE FALSE 166
## 3 RefSeq TRUE 15330
## 4 TE TRUE 72
Number of significantly changed transcripts (qval 0.05, LRT or Walt Test; logFC (onscaled_reads_per_base)):
#Wald test
tab.all %>% filter(abs(log2FC.count)>1, qval.wald<0.05) %>% mutate("change"=ifelse(log2FC.count>0,"up", "down")) %>% group_by(type,change) %>% tally() %>% arrange(desc(change))
## # A tibble: 4 x 3
## # Groups: type [2]
## type change n
## <fct> <chr> <int>
## 1 RefSeq up 187
## 2 TE up 36
## 3 RefSeq down 159
## 4 TE down 2
#LRT
tab.all %>% filter(abs(log2FC.count)>1, qval.lrt<0.05) %>% mutate("change"=ifelse(log2FC.count>0,"up", "down")) %>% group_by(type,change) %>% tally() %>% arrange(desc(change))
## # A tibble: 4 x 3
## # Groups: type [2]
## type change n
## <fct> <chr> <int>
## 1 RefSeq up 56
## 2 TE up 23
## 3 RefSeq down 64
## 4 TE down 2
From 17975 RefSeq transcripts and 166 RebBase entries which passed the initial filters, 187(56) and 36(23) are significantly up-regulated more than 2-fold in smt3 KD, and 159(64) or 2 are down-regulated more than 2-fold (qvalue 0.05, Wald test or LTR test).
Therefore, smt3 KD led to >2 fold up/down -regulation for ~1% of the transcripts (protein-coding and ncRNAs), and up-regulation of ~20% of the TEs.
R1 and R2 - from relatively lowly expressed genes in the control - become among the top 20 most abundant products in SUMO KD. See below:
Although many txs change, it is evident from the plots mostly modestly expressed ones show large FC, except for R1 and R2. If we filter txs with over 100 times change in expression log2FC>=~6.64, significant in at least one test:
tab.all %>% filter(log2FC.count>log2(100)) %>% filter(qval.wald<0.05 | qval.lrt<0.05) %>%
select(target_id, shW.1.count, shW.2.count, shSmt3.1.count, shSmt3.2.count, log2FC.count, type, b, qval.wald, qval.lrt) %>% mutate_at(2:5, funs(round(.,3))) %>% arrange(-( shSmt3.1.count*0.5+shSmt3.2.count*0.5)) %>% datatable(options=list(pageLength=5))
Results show that all other genes, apart from R1 and R2 in this set, are ones that practically have no reads in the control, and just few reads in the KD.
Numbers below show the expression of R1 and R2 elements; including rank of the average values. “X” and “Y” are average per control and KD, respectively:
*Note: Values can be seen as transcripts per million (TPM), see this explanation of TPM , or just counts.
tab.all.ranks1<-tab.all %>% filter(!filt_out) %>% select(target_id, matches("sh.*count"),matches("(x|y).count")) %>% mutate(ctrl.rank.count=rank(desc(x.count)), kd.rank.count=rank(desc(y.count)))
tab.all.ranks2<-tab.all %>% filter(!filt_out) %>% select(target_id, matches("sh.*tpm"),matches("(x|y).tpm")) %>% mutate(ctrl.rank.tpm=rank(desc(x.tpm)), kd.rank.tpm=rank(desc(y.tpm)))
tab.all.ranks1 %>% filter(str_detect(target_id, "^R[12]_"))
## target_id shW.1.count shSmt3.1.count shW.2.count shSmt3.2.count x.count y.count ctrl.rank.count kd.rank.count
## 1 R1_DM|Drosophila_fruit_fly_genus 239.2480 348313.7 243.4254 201125.2 241.3367 274719.5 9076 1
## 2 R2_DM|Drosophila_fruit_fly_genus 629.5107 194856.4 621.1112 197785.8 625.3109 196321.1 5702 4
tab.all.ranks2 %>% filter(str_detect(target_id, "^R[12]_"))
## target_id shW.1.tpm shSmt3.1.tpm shW.2.tpm shSmt3.2.tpm x.tpm y.tpm ctrl.rank.tpm kd.rank.tpm
## 1 R1_DM|Drosophila_fruit_fly_genus 2.740584 4013.985 2.790111 2315.011 2.765347 3164.498 11553 18
## 2 R2_DM|Drosophila_fruit_fly_genus 10.911782 3397.955 10.772657 3444.922 10.842219 3421.439 7863 16
Notably, R1 and R2 change from relatively lowly expressed elements, to the top 20 most abundant transcripts (in TPM). No other elements shows such a dramatic increase.
Numbers of genes and TEs that pass the filters (same as in the gene-mode analysis)
tab.gene.stat<-full_join(sleuth_results_LRT_gene, sleuth_results_wald_gene, by="target_id", suffix=c(".lrt",".wald")) %>% mutate("filt_out"=is.na(qval.lrt), "type"=as.factor(ifelse(str_detect(target_id, "\\|"),"TE","RefSeq")))
tab.gene.stat %>% group_by(type, filt_out) %>% tally() %>% arrange(filt_out) %>% as.data.frame()
## type filt_out n
## 1 RefSeq FALSE 8996
## 2 TE FALSE 166
## 3 RefSeq TRUE 7616
## 4 TE TRUE 72
Number of significantly changed elements, either direction:
#LRT
tab.gene.stat %>% filter(qval.lrt<0.05) %>% group_by(type, filt_out) %>% tally() %>% arrange(filt_out) %>% as.data.frame()
## type filt_out n
## 1 RefSeq FALSE 773
## 2 TE FALSE 51
#Wald test
tab.gene.stat %>% filter(qval.wald<0.05) %>% group_by(type, filt_out) %>% tally() %>% arrange(filt_out) %>% as.data.frame()
## type filt_out n
## 1 RefSeq FALSE 1280
## 2 TE FALSE 59
To introduce a fold-change cutoff here, we can use the original transcript counts. Note that each transcript has different fold change. Here, I extract genes based on aggregated qvals < 0.05, and for which at leat 1 transcript has >2-fold change.
table.counts<-left_join(table.counts, t2g, by="target_id")
#LRT
sign<-tab.gene.stat %>% filter(qval.lrt<0.05)
merged<-inner_join(table.counts, sign, by=c("gene"="target_id"))
merged %>% filter(abs(log2FC)>1) %>% mutate("change"=ifelse(log2FC>0,"up","down")) %>% select(gene, type, change) %>% distinct() %>% group_by(type, change) %>% tally() %>% arrange(change) %>% data_frame()
## # A tibble: 4 x 3
## type change n
## <fct> <chr> <int>
## 1 RefSeq down 221
## 2 TE down 2
## 3 RefSeq up 209
## 4 TE up 35
#Wald test
sign<-tab.gene.stat %>% filter(qval.wald<0.05)
merged<-inner_join(table.counts, sign, by=c("gene"="target_id"))
merged %>% filter(abs(log2FC)>1) %>% mutate("change"=ifelse(log2FC>0,"up","down")) %>% select(gene, type, change) %>% distinct() %>% group_by(type, change) %>% tally() %>% arrange(change) %>% data_frame()
## # A tibble: 4 x 3
## type change n
## <fct> <chr> <int>
## 1 RefSeq down 333
## 2 TE down 3
## 3 RefSeq up 345
## 4 TE up 37